@inproceedings{gao-etal-2025-decompilebench,
title = "{D}ecompile{B}ench: A Comprehensive Benchmark for Evaluating Decompilers in Real-World Scenarios",
author = "Gao, Zeyu and
Cui, Yuxin and
Wang, Hao and
Qin, Siliang and
Wang, Yuanda and
Bolun, Zhang and
Zhang, Chao",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1194/",
doi = "10.18653/v1/2025.findings-acl.1194",
pages = "23250--23267",
ISBN = "979-8-89176-256-5",
abstract = "Decompilers are fundamental tools for critical security tasks, from vulnerability discovery to malware analysis, yet their evaluation remains fragmented. Existing approaches primarily focus on syntactic correctness through synthetic micro-benchmarks or subjective human ratings, failing to address real-world requirements for semantic fidelity and analyst usability. We present **DecompileBench**, the first comprehensive framework that enables effective evaluation of decompilers in reverse engineering workflows through three key components: real-world function extraction (comprising 23,400 functions from 130 real-world programs), runtime-aware validation, and automated human-centric assessment using LLM-as-Judge to quantify the effectiveness of decompilers in reverse engineering workflows. Through a systematic comparison between six industrial-strength decompilers and six recent LLM-powered approaches, we demonstrate that LLM-based methods surpass commercial tools in code understandability despite 52.2{\%} lower functionality correctness. These findings highlight the potential of LLM-based approaches to transform human-centric reverse engineering. We open source **DecompileBench** to provide a framework to advance research on decompilers and assist security experts in making informed tool selections based on their specific requirements."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gao-etal-2025-decompilebench">
<titleInfo>
<title>DecompileBench: A Comprehensive Benchmark for Evaluating Decompilers in Real-World Scenarios</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zeyu</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxin</namePart>
<namePart type="family">Cui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siliang</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuanda</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhang</namePart>
<namePart type="family">Bolun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Decompilers are fundamental tools for critical security tasks, from vulnerability discovery to malware analysis, yet their evaluation remains fragmented. Existing approaches primarily focus on syntactic correctness through synthetic micro-benchmarks or subjective human ratings, failing to address real-world requirements for semantic fidelity and analyst usability. We present **DecompileBench**, the first comprehensive framework that enables effective evaluation of decompilers in reverse engineering workflows through three key components: real-world function extraction (comprising 23,400 functions from 130 real-world programs), runtime-aware validation, and automated human-centric assessment using LLM-as-Judge to quantify the effectiveness of decompilers in reverse engineering workflows. Through a systematic comparison between six industrial-strength decompilers and six recent LLM-powered approaches, we demonstrate that LLM-based methods surpass commercial tools in code understandability despite 52.2% lower functionality correctness. These findings highlight the potential of LLM-based approaches to transform human-centric reverse engineering. We open source **DecompileBench** to provide a framework to advance research on decompilers and assist security experts in making informed tool selections based on their specific requirements.</abstract>
<identifier type="citekey">gao-etal-2025-decompilebench</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1194</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1194/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>23250</start>
<end>23267</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DecompileBench: A Comprehensive Benchmark for Evaluating Decompilers in Real-World Scenarios
%A Gao, Zeyu
%A Cui, Yuxin
%A Wang, Hao
%A Qin, Siliang
%A Wang, Yuanda
%A Bolun, Zhang
%A Zhang, Chao
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F gao-etal-2025-decompilebench
%X Decompilers are fundamental tools for critical security tasks, from vulnerability discovery to malware analysis, yet their evaluation remains fragmented. Existing approaches primarily focus on syntactic correctness through synthetic micro-benchmarks or subjective human ratings, failing to address real-world requirements for semantic fidelity and analyst usability. We present **DecompileBench**, the first comprehensive framework that enables effective evaluation of decompilers in reverse engineering workflows through three key components: real-world function extraction (comprising 23,400 functions from 130 real-world programs), runtime-aware validation, and automated human-centric assessment using LLM-as-Judge to quantify the effectiveness of decompilers in reverse engineering workflows. Through a systematic comparison between six industrial-strength decompilers and six recent LLM-powered approaches, we demonstrate that LLM-based methods surpass commercial tools in code understandability despite 52.2% lower functionality correctness. These findings highlight the potential of LLM-based approaches to transform human-centric reverse engineering. We open source **DecompileBench** to provide a framework to advance research on decompilers and assist security experts in making informed tool selections based on their specific requirements.
%R 10.18653/v1/2025.findings-acl.1194
%U https://aclanthology.org/2025.findings-acl.1194/
%U https://doi.org/10.18653/v1/2025.findings-acl.1194
%P 23250-23267
Markdown (Informal)
[DecompileBench: A Comprehensive Benchmark for Evaluating Decompilers in Real-World Scenarios](https://aclanthology.org/2025.findings-acl.1194/) (Gao et al., Findings 2025)
ACL